library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1 ✔ purrr 0.2.4
## ✔ tibble 1.3.4 ✔ dplyr 0.7.4
## ✔ tidyr 0.7.2 ✔ stringr 1.2.0
## ✔ readr 1.1.1 ✔ forcats 0.2.0
## ── Conflicts ────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(here)
## here() starts at /Users/hlynur/wd/ml_case_studies
library(glue)
##
## Attaching package: 'glue'
## The following object is masked from 'package:dplyr':
##
## collapse
library(caret)
## Warning: package 'caret' was built under R version 3.4.4
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
sisters <- read_csv(file = here("data", "sisters67.csv"))
## Parsed with column specification:
## cols(
## .default = col_integer()
## )
## See spec(...) for full column specifications.
sisters <- sisters %>%
mutate(age = (v181 + 1) * 10) %>%
select(sister, age, glue("v{116:180}"))
sisters
sisters %>%
ggplot(mapping = aes(x = age)) +
geom_histogram(binwidth = 10)
## Warning: Removed 5030 rows containing non-finite values (stat_bin).

tidy_sisters <- sisters %>%
select(-sister) %>%
gather(key, value, -age)
tidy_sisters %>%
group_by(age) %>%
summarize(medaltal = mean(value, na.rm = TRUE))
tidy_sisters %>%
count(value)
tidy_sisters %>%
filter(complete.cases(.),
key %in% glue("v{153:170}")) %>%
group_by(key, value) %>%
summarise(age = mean(age)) %>%
ggplot(aes(value, age, color = key)) +
geom_path(alpha = 0.5, size = 1.5) +
geom_point(size = 2) +
facet_wrap(~key) +
guides(color = FALSE)

sisters_select <- sisters %>%
select(-sister) %>%
filter(complete.cases(.))
simple_lm <- lm(age ~.,
data = sisters_select)
summary(simple_lm)
##
## Call:
## lm(formula = age ~ ., data = sisters_select)
##
## Residuals:
## Min 1Q Median 3Q Max
## -51.040 -9.572 -1.120 8.946 63.563
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 28.7590627 0.5235690 54.929 < 2e-16 ***
## v116 -0.5987470 0.0386810 -15.479 < 2e-16 ***
## v117 -0.0565771 0.0440215 -1.285 0.198721
## v118 -0.8286660 0.0420266 -19.718 < 2e-16 ***
## v119 -0.2928765 0.0412276 -7.104 1.22e-12 ***
## v120 -0.0414388 0.0373361 -1.110 0.267053
## v121 -0.1015904 0.0387306 -2.623 0.008718 **
## v122 0.0366553 0.0403965 0.907 0.364205
## v123 -1.0233578 0.0448046 -22.840 < 2e-16 ***
## v124 0.5335434 0.0430399 12.396 < 2e-16 ***
## v125 0.4809905 0.0368283 13.060 < 2e-16 ***
## v126 0.3458564 0.0422866 8.179 2.91e-16 ***
## v127 -0.0192932 0.0392643 -0.491 0.623168
## v128 0.0009816 0.0377115 0.026 0.979234
## v129 0.0803219 0.0386387 2.079 0.037639 *
## v130 0.5897471 0.0398385 14.803 < 2e-16 ***
## v131 -0.5465149 0.0420680 -12.991 < 2e-16 ***
## v132 0.0988385 0.0459866 2.149 0.031614 *
## v133 -0.5773012 0.0374372 -15.421 < 2e-16 ***
## v134 -0.1041877 0.0453769 -2.296 0.021676 *
## v135 0.3732995 0.0542140 6.886 5.80e-12 ***
## v136 -0.1364003 0.0368229 -3.704 0.000212 ***
## v137 0.6211468 0.0505179 12.296 < 2e-16 ***
## v138 0.2985449 0.0523696 5.701 1.20e-08 ***
## v139 1.4131884 0.0520453 27.153 < 2e-16 ***
## v140 -0.7349810 0.0366384 -20.060 < 2e-16 ***
## v141 0.3995640 0.0462512 8.639 < 2e-16 ***
## v142 -0.2730768 0.0414278 -6.592 4.38e-11 ***
## v143 0.0755909 0.0417660 1.810 0.070320 .
## v144 1.0629185 0.0536219 19.822 < 2e-16 ***
## v145 -0.6436678 0.0405920 -15.857 < 2e-16 ***
## v146 -0.4368924 0.0402487 -10.855 < 2e-16 ***
## v147 -0.4288801 0.0487510 -8.797 < 2e-16 ***
## v148 1.4320922 0.0452772 31.629 < 2e-16 ***
## v149 -0.2656667 0.0423536 -6.273 3.57e-10 ***
## v150 -0.2451097 0.0418002 -5.864 4.54e-09 ***
## v151 0.7228294 0.0443463 16.300 < 2e-16 ***
## v152 0.0565705 0.0405800 1.394 0.163307
## v153 -0.6064906 0.0454401 -13.347 < 2e-16 ***
## v154 1.0710276 0.0387038 27.672 < 2e-16 ***
## v155 0.8692618 0.0490792 17.711 < 2e-16 ***
## v156 1.2491486 0.0454012 27.514 < 2e-16 ***
## v157 0.1566712 0.0410294 3.819 0.000134 ***
## v158 -0.0917289 0.0413971 -2.216 0.026706 *
## v159 -0.2252867 0.0412120 -5.467 4.60e-08 ***
## v160 0.2028421 0.0425102 4.772 1.83e-06 ***
## v161 0.7971671 0.0433414 18.393 < 2e-16 ***
## v162 -0.2863248 0.0496361 -5.768 8.03e-09 ***
## v163 -1.3637209 0.0459544 -29.676 < 2e-16 ***
## v164 -0.3758315 0.0346484 -10.847 < 2e-16 ***
## v165 -0.4915162 0.0450373 -10.914 < 2e-16 ***
## v166 0.1195126 0.0417274 2.864 0.004183 **
## v167 -0.1726421 0.0406119 -4.251 2.13e-05 ***
## v168 0.3797206 0.0466675 8.137 4.12e-16 ***
## v169 1.4159979 0.0428306 33.060 < 2e-16 ***
## v170 1.2342274 0.0450740 27.382 < 2e-16 ***
## v171 0.8843819 0.0415874 21.266 < 2e-16 ***
## v172 -0.0140677 0.0356989 -0.394 0.693535
## v173 0.7792050 0.0344868 22.594 < 2e-16 ***
## v174 0.2376283 0.0390072 6.092 1.12e-09 ***
## v175 0.1756163 0.0392362 4.476 7.62e-06 ***
## v176 0.8291804 0.0445449 18.614 < 2e-16 ***
## v177 -0.0007449 0.0476089 -0.016 0.987517
## v178 -0.0239207 0.0411134 -0.582 0.560688
## v179 0.2843894 0.0459248 6.193 5.95e-10 ***
## v180 0.0481848 0.0422827 1.140 0.254462
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.04 on 77046 degrees of freedom
## Multiple R-squared: 0.3386, Adjusted R-squared: 0.338
## F-statistic: 606.8 on 65 and 77046 DF, p-value: < 2.2e-16
set.seed(1234)
in_train <- createDataPartition(sisters_select$age,
p = 0.6,
list = FALSE)
training <- sisters_select[in_train, ]
validation_test <- sisters_select[-in_train, ]
in_test <- createDataPartition(validation_test$age,
p = 0.5,
list = FALSE)
testing <- validation_test[in_test, ]
validation <- validation_test[-in_test, ]
#CART
sisters_cart <- train(x = select(training, -age),
y = training$age,
method = "rpart")
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
## Warning: Setting row names on a tibble is deprecated.
# ## xgboost
# sisters_rf <- train(age ~ ., method = "xgbLinear", data = training)
#
# ## gbm
# sisters_gbm <- train(age ~ ., method = "gbm", data = training)
library(yardstick)
## Loading required package: broom
##
## Attaching package: 'yardstick'
## The following objects are masked from 'package:caret':
##
## mnLogLoss, precision, recall
## The following object is masked from 'package:readr':
##
## spec
model_results <- validation %>%
mutate(CART = predict(sisters_cart, validation))
model_results %>%
metrics(truth = age, estimate = CART)